In [1]:
#Library imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import nan
In [2]:
#Data inspection of dataframe
df_iris = pd.read_csv("iris.csv") #Read the clean data file
# Returns the first 3 rows, default 5
print("\n") #Blank row
print("First 3 rows in dataframe:\n"
,df_iris.iloc[:, 0:6].head(3))
# Checking data types of columns
print("\n") #Blank row
print("Data structure:\n")
df_iris.iloc[:, 0:6].info()
# Return data descriptive statistics(for numeric columns values)
print("\n") #Blank row
print("Data description:\n")
df_iris.iloc[:, 0:6].describe()
# Return data values
#df_iris.values
num_df = df_iris.iloc[:,1:5] #dataframe with only numeric colums, for correlation etc.
# Heatmap
#.1e = scientific notation with 1 decimal point (standard form)
#.2f = 2 decimal places
#.3g = 3 significant figures
#.4% = percentage with 4 decimal places
plt.figure(figsize=(10, 8))
sns.heatmap(num_df.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt='.3f')
plt.title("Heatmap of Correlation Matrix with Fixed Axis")
plt.show()
# Histograms
import plotly.graph_objects as go
from plotly.subplots import make_subplots
fig = make_subplots(
rows=(num_df.shape[1]//3)+1,
cols=3,
subplot_titles=num_df.columns
)
for i, col in enumerate(num_df.columns):
fig.add_trace(go.Histogram(x=num_df[col], name=col), row=(i//3)+1, col=(i%3)+1)
fig.update_layout(height=1000, showlegend=False)
fig.show()
First 3 rows in dataframe:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
Data structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 150 non-null int64
1 SepalLengthCm 150 non-null float64
2 SepalWidthCm 150 non-null float64
3 PetalLengthCm 150 non-null float64
4 PetalWidthCm 150 non-null float64
5 Species 150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
Data description:
In [3]:
# Data import - Introducing unclean data
df_iris = pd.read_csv("iris.csv") #Read the clean data file
print("\nOriginal data shape:", df_iris.shape) #dataframe dimensions(Nr of rows, Nr of columns)
# Introducing unclean data to the dataset, for the purpose for demonstrating data engineering
df_iris_unclean = pd.read_csv("unclean_iris.csv") #Read the unclean data file
df_concatenated = pd.concat([df_iris , df_iris_unclean], ignore_index=False) #merge both files
print("\nData shape after adding unclean data:",df_concatenated.shape)
Original data shape: (150, 6) Data shape after adding unclean data: (170, 6)
In [4]:
# Clean the data in numeric columns:
# 1. Data Type Errors - numeric columns have non-numeric data
# 2. Missing values in various columns
# Numeric columns to check
numeric_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
#print("DataFrame before cleaning:\n", df_concatenated)
# 1. Handle Data Type Errors (Convert to numeric, replace errors with NaN) # Convert specific numeric columns to numeric data types
errors_found = False
for col in numeric_cols:
try:
df_concatenated[col] = pd.to_numeric(df_concatenated[col])
except ValueError as e:
print(f"\nError converting '{col}' to numeric: {e}")
print(f"Non-numeric values found in '{col}'. Replacing with NaN.")
df_concatenated[col] = pd.to_numeric(df_concatenated[col], errors='coerce') #handle errors
errors_found = True
if errors_found:
print("\nMissing values after numeric conversion:\n", df_concatenated.isnull().sum())
# 2. Handle Missing Values (Remove rows with NaN)
initial_missing_count = df_concatenated.isnull().sum().sum() # Check for missing values before removing rows
print("\nMissing values before removing rows:", initial_missing_count)
df_concatenated.dropna(inplace=True) # Remove rows with any missing values (NaN), inplace=True to modify the DataFrame directly.
#df_concatenated = df_concatenated.dropna() # dropna() removes rows with at least one NaN value.
#df = df_concatenated # Assign the cleaned DataFrame to the 'df' variable, This ensures that the cleaned data is used in subsequent steps.
# 3. Verify that missing values have been removed.
if df_concatenated.isnull().sum().sum() == 0:
if initial_missing_count == 0:
print("\nNo missing values were present initially.")
else:
print("\nMissing values successfully removed.")
else:
print("\nWarning: Missing values still present after removal.")
print("\nRemaining missing values:\n", df_concatenated.isnull().sum())
print("\nData shape after removing NaN rows:", df_concatenated.shape)
Error converting 'PetalLengthCm' to numeric: Unable to parse string "error" at position 152 Non-numeric values found in 'PetalLengthCm'. Replacing with NaN. Missing values after numeric conversion: Id 0 SepalLengthCm 0 SepalWidthCm 2 PetalLengthCm 1 PetalWidthCm 1 Species 0 dtype: int64 Missing values before removing rows: 4 Missing values successfully removed. Data shape after removing NaN rows: (166, 6)
In [5]:
# Clean the data in str column:
# 1. Extra Whitespace in the 'Species' column (leading or trailing whitespace)
# 2. Inconsistent formatting in the 'Species' column (species names have variations in capitalization and spelling.)
# 3. Drop rows with empty strings or NaN values in 'Species'
# 1. Remove Whitespace in 'Species'
original_species = df_concatenated['Species'].astype(str)
cleaned_species = original_species.str.strip() #Remove whitespace from each string element in the current column, fast vectorized string operation
whitespace_removed = (original_species != cleaned_species).sum()
print(f"\nWhitespace removed from 'Species' column: {whitespace_removed}")
df_concatenated['Species'] = cleaned_species
# 2. Correct Inconsistent Formatting in 'Species'
print("\nUnique Species labels before formatting:\n", sorted(df_concatenated['Species'].unique()))
species_mapping = {
'Iris-vericolour': 'Iris-versicolor',
'Iris-VERSICOLOR': 'Iris-versicolor',
'Iris-vers': 'Iris-versicolor',
'Iris-SETOSA': 'Iris-setosa',
}
df_concatenated['Species'] = df_concatenated['Species'].replace(species_mapping)
# 3. Drop rows with empty strings or NaN values
df_concatenated = df_concatenated[df_concatenated['Species'].replace('', np.nan).notna()] # Use numpy.nan for consistency
# 4. Verify lable result
print("\nUnique Species labels after cleaning:\n", sorted(df_concatenated['Species'].unique()))
Whitespace removed from 'Species' column: 2 Unique Species labels before formatting: ['Iris-SETOSA', 'Iris-VERSICOLOR', 'Iris-setosa', 'Iris-vericolour', 'Iris-vers', 'Iris-versicolor', 'Iris-virginica'] Unique Species labels after cleaning: ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
In [6]:
# Clean the data: Remove duplicates rows
# Get a list of columns excluding the index column
columns_to_check = [col for col in df_concatenated.columns if col != 'Id']
# Store the original number of rows
original_row_count = len(df_concatenated)
# Drop duplicate rows based on the other columns
df_no_duplicates = df_concatenated.drop_duplicates(subset=columns_to_check) # drop_duplicates() checks all columns by default.
# Store the number of rows after duplicate removal
new_row_count = len(df_no_duplicates)
# Calculate the number of rows removed
rows_removed = original_row_count - new_row_count
print(f"Original number of rows: {original_row_count}")
print(f"Number of rows after duplicate removal: {new_row_count}")
print(f"Number of rows removed: {rows_removed}")
Original number of rows: 166 Number of rows after duplicate removal: 155 Number of rows removed: 11
In [7]:
# Feature Engineering for K-Nearest Neighbors (KNN)
# Label Encoding of 'Species' column
from sklearn.preprocessing import LabelEncoder
def encode_species(df):
df_copy = df.copy() # Create a copy within the function
label_encoder = LabelEncoder()
df_copy.loc[:, 'Species_encoded'] = label_encoder.fit_transform(df_copy['Species'])
return df_copy # Return the copy
# Example Usage (assuming df_no_duplicates is your DataFrame)
df_iris = encode_species(df_no_duplicates.copy()) # Create a copy before passing
# 4. Verify label result
# Print unique Species labels after encoding
print("\nUnique Species labels with encoded values:")
# Create a mapping dictionary for clarity
label_encoder = LabelEncoder()
label_encoder.fit(df_iris['Species'])
species_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
# Print unique labels and their encoded values
for species, encoded_value in species_mapping.items():
print(f"'{species}': {encoded_value}")
Unique Species labels with encoded values: 'Iris-setosa': 0 'Iris-versicolor': 1 'Iris-virginica': 2
In [8]:
# Train and evaluate K-Nearest Neighbors(KNN) model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
# Separate features (X) and target (y)
X = df_iris.drop(['Species', 'Species_encoded'], axis=1)
y = df_iris['Species_encoded']
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Feature Engineering: Scale features with StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define the hyperparameter grid
param_grid = {
'n_neighbors': range(1, 21),
'metric': ['euclidean', 'manhattan', 'minkowski'],
'weights': ['uniform', 'distance'],
'p': [1, 2] # Only for Minkowski
}
# Create the KNN classifier
knn = KNeighborsClassifier()
# Perform grid search
grid_search = GridSearchCV(knn, param_grid, cv=5) # 5-fold cross-validation
grid_search.fit(X_train_scaled, y_train) # use scaled X_train
# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)
# Evaluate the model with the best hyperparameters
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train_scaled, y_train) # use scaled X_train
y_pred = best_knn.predict(X_test_scaled) # use scaled X_test
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("Precision:", precision_score(y_test, y_pred, average='weighted')) # Weighted average for multi-class
print("Recall:", recall_score(y_test, y_pred, average='weighted')) # Weighted average for multi-class
print("F1-Score:", f1_score(y_test, y_pred, average='weighted')) # Weighted average for multi-class
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Accuracy: 0.9375
Precision: 0.95
Recall: 0.9375
F1-Score: 0.9385683760683761
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 5
1 1.00 0.86 0.92 7
2 0.80 1.00 0.89 4
accuracy 0.94 16
macro avg 0.93 0.95 0.94 16
weighted avg 0.95 0.94 0.94 16
In [9]:
# Model data visualization
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Plot Confusion Matrix for KNN-model
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_) #use the label encoder that was used to create the encoded species.
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
# Plot Pair Plot for KNN-model
def create_pairplot(df):
sns.pairplot(df, hue='Species', vars=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'])
plt.show()
create_pairplot(df_iris)
In [ ]: